library(dplyr)
library(ggplot2)
library(DESeq2)
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
library(fgsea)
library(limma)
Plasma 1:10
Viridis 1:10
Cividis 1:10
Magma 1:10
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics
| Sample | patient | reads | %Q30 | Duplication rate | % reads with adapter | STAR alignment number | percent aligned | splices annotated | salmon mapping |
|---|---|---|---|---|---|---|---|---|---|
| Bulk | HTB314 | 71816976 | 94.3 | 57 | 2.6 | 63614504 | 88.58 | 27340654 | 85.6967 |
| Bulk | MDS268 | 31441538 | 93.0 | 24 | 2.3 | 25549835 | 81.26 | 5546270 | 58.2075 |
| Bulk | MDS280 | 28794421 | 93.0 | 24 | 2.3 | 23317205 | 80.98 | 5115546 | 58.8614 |
| CD123+ | HTB314 | 61568500 | 94.0 | 56 | 2.7 | 55359277 | 89.91 | 22831187 | 86.4197 |
| CD123+ | MDS268 | 32307881 | 93.0 | 27 | 3.3 | 26243441 | 81.23 | 7290861 | 64.2150 |
| CD123+ | MDS280 | 23745205 | 93.0 | 34 | 2.6 | 21035358 | 88.59 | 5668653 | 71.3245 |
| CD123- | HTB314 | 83260626 | 94.0 | 58 | 2.6 | 74804722 | 89.84 | 31876605 | 87.6505 |
| CD123- | MDS268 | 27917999 | 93.0 | 26 | 2.7 | 22835189 | 81.79 | 5793322 | 63.9284 |
| CD123- | MDS280 | 24767573 | 93.0 | 32 | 2.6 | 22467937 | 90.72 | 4773823 | 62.7193 |
##This section of code will import and format the data for DeSeq2
#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/Counts/salmon/salmon/", list.files("~/Desktop/Jordan files/Counts/salmon/salmon/"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/Counts/salmon/salmon")
#to call in a gene_map this was derived by pulling it out of the fasta file with a grep command for ENST and ENSG and pasting them together
gene_map=read_csv("~/Desktop/Jordan files/Counts/salmon/gene_map.csv", col_names=c('enstid', 'ensgid'))
# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= gene_map, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file
# Extract counts only
counts <- txi.salmon.g$counts %>%
as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)
##for clients the counts and tpm files should be written out
##This chunk of code sets an expression threshold for TMP where all samples have at least 5 counts
expressed_genes<-tpms %>% filter_all(all_vars(. > 5))
files.rna<-file.path("~/Desktop/Jordan files/counts.rRNA/rRNA/", list.files("~/Desktop/Jordan files/counts.rRNA/rRNA"), "quant.sf")
names(files.rna)<-list.files("~/Desktop/Jordan files/counts.rRNA/rRNA")
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host='www.ensembl.org')
t2g_hs <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
tx2gene <- t2g_hs[,c(1,2)]
tx2gene[nrow(tx2gene)+1,]=c('rRNA_45S_NR145819','rRNA_45S_NR145819')
txi.salmon.rRNA.g<-tximport(files=files.rna, type="salmon", tx2gene= tx2gene, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9
## removing duplicated transcript rows from tx2gene
## summarizing abundance
## summarizing counts
## summarizing length
metadata<-read_table2("~/Desktop/Jordan files/Counts/sample.txt")
## Warning: `read_table2()` was deprecated in readr 2.0.0.
## Please use `read_table()` instead.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## SampleName = col_character(),
## FileName = col_character(),
## patient = col_character(),
## cellType = col_character()
## )
sums<-colSums(txi.salmon.rRNA.g$counts)
sums<-as.data.frame(sums)
counts.rRNA<-as.data.frame(txi.salmon.rRNA.g$counts)
rna.counts<-as.data.frame(t(slice_tail(counts.rRNA)))
rna.counts <-merge(rna.counts,sums,by='row.names',all=TRUE)
rna.counts<-mutate(rna.counts, "percentrRNA"= (rRNA_45S_NR145819 /sums)*100 )
names(rna.counts)[1] <- 'SampleName'
rna.counts<-select(rna.counts, c("SampleName", "percentrRNA"))
metadata<-dplyr::full_join(metadata,rna.counts, by="SampleName")
metadata$percentrRNA <- round(metadata$percentrRNA ,digit=2)
##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap ## PCA plot
exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))
## 3. Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment ## Compare 123pos vs 123neg, 123neg vs bulk, and 123pos vs bulk #### sig = padj <0.01 and abs(l2fc) >0.5 ####
coldata<-dplyr::select(metadata, -FileName)
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] FALSE
coldata<-coldata[colnames(txi.salmon.g$counts),]
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
dds <- DESeqDataSetFromTximport(txi.salmon.g,
colData = coldata,
design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
dds<-estimateSizeFactors(dds)
sf<-as.data.frame(dds$sizeFactor)
sf<-rownames_to_column(sf, var="SampleName")
metadata<-inner_join(metadata, sf, by="SampleName")
names(metadata)[6] <- "sizeFactor"
metadata$sizeFactor <- round(metadata$sizeFactor ,digit=2)
### unfiltered data
dds.unfiltered <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)
##filtered data changed this from dds[rowMins(counts(dds))>5,] to a less stringent filtering
keep<-rowSums(counts(dds))>=10
#dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-dds[keep,]
dds.filtered<-DESeq(dds.filtered)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)
###outcome the first filtering cut from 60000 genese to 14000 genes with the new filter 35000genes are left which is about 1/2
### need to subset to do pairwise comparison unclear if this keeps the patient design aspect
res_123posvs123neg_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123neg", "bulk"))
res_123posvs123neg_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"))
## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123posvs123neg_unfiltered$pvalue < 0.01 & abs(res_123posvs123neg_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1405
sum( res_123posvsBulk_unfiltered$pvalue < 0.01 & abs(res_123posvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2382
sum(res_123negvsBulk_unfiltered$pvalue < 0.01 & abs(res_123negvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2336
sum( res_123posvs123neg_filtered$pvalue < 0.01 & abs(res_123posvs123neg_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1206
sum( res_123posvsBulk_filtered$pvalue < 0.01 & abs(res_123posvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2157
sum(res_123negvsBulk_filtered$pvalue < 0.01 & abs(res_123negvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2130
sum(res_123posvs123neg_filtered$padj < 0.1, na.rm=TRUE)
## [1] 1000
sum(res_123posvs123neg_unfiltered$padj < 0.1, na.rm=TRUE)
## [1] 1180
## Warning: Removed 19307 rows containing missing values (geom_point).
## Warning: Removed 20003 rows containing missing values (geom_point).
## Warning: Removed 20003 rows containing missing values (geom_point).
###MA plots
pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar2 <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType,percent=metadata$percentrRNA)
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")
ggplot(pca_df, aes(PC1, PC2, color=type, shape=metadata$patient))+geom_point(size=3)+
xlab(paste0("PC1: ",round(percentVar2$percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar2$percentVar[2] * 100), "% variance"))+
scale_color_manual(values=alt_col_values)+labs(color="")+theme_bw()+
ggtitle("PCA of analyzed data")
pca_data_limma<-limma::removeBatchEffect(assay(pca_data), pca_data$patient )
#rv <- rowVars(assay(pca_data_limma))
#select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca_limma <- prcomp(t(pca_data_limma))
percentVar3 <- data.frame(percentVar=pca_limma$sdev^2/sum(pca_limma$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df_limma<-pca_limma$x%>%data.frame()%>%mutate(type=metadata$cellType)
ggplot(pca_df_limma, aes(PC1, PC2, color=type, shape=metadata$patient))+geom_point(size=3)+
xlab(paste0("PC1: ",round(percentVar3$percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar3$percentVar[2] * 100), "% variance"))+
scale_color_manual(values=alt_col_values)+labs(color="")+theme_bw()+
ggtitle("PCA of analyzed data with limma")
pca_data2<-vst(dds.filtered, blind=T)
ntop=500
rv <- rowVars(assay(pca_data2))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca2 <- prcomp(t(assay(pca_data2)[select,]))
percentVar4 <- data.frame(percentVar=pca2$sdev^2/sum(pca2$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df2<-pca2$x%>%data.frame()%>%mutate(type=metadata$cellType, percent=metadata$percentrRNA)
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")
ggplot(pca_df2, aes(PC1, PC2, color=type, shape=metadata$patient))+geom_point(size=3)+
xlab(paste0("PC1: ",round(percentVar4$percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar4$percentVar[2] * 100), "% variance"))+
scale_color_manual(values=alt_col_values)+labs(color="")+theme_bw()+
ggtitle("PCA of analyzed data using filtered data")
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
##generates a plot of FBX015 a gene in paper up in 123+
d<-plotCounts(dds, gene="ENSG00000141665",intgroup=c("cellType","patient"), returnData = TRUE)
ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()
##generates a plot of EEFSEC a gene in paper up in 123+
d<-plotCounts(dds, gene="ENSG00000132394",intgroup=c("cellType","patient"), returnData = TRUE)
ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()
##generates a plot of TGM4 a gene in paper down in 123+
d<-plotCounts(dds, gene="ENSG00000163810",intgroup=c("cellType","patient"), returnData = TRUE)
ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()
##generates a plot of CRLF2 a gene in paper down in 123+
d<-plotCounts(dds, gene="ENSG00000163810",intgroup=c("cellType","patient"), returnData = TRUE)
ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()
###GSEA analysis
##generate gene names to go with ensembl gene id as the pathways will use gene name
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host='www.ensembl.org')
t2g_hs <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
ens2gene <- t2g_hs[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)
##loading hallmark pathways and KEGG pathways
pathways.hallmark <- gmtPathways("~/Desktop/Jordan files/h.all.v7.4.symbols.gmt")
pathways.kegg<-gmtPathways("~/Desktop/Jordan files/c2.cp.kegg.v7.4.symbols.gmt")
##generating tidy data and adding it to the results file
##123pos vs 123neg analysis
res_posneg_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"),tidy=TRUE)
colnames(res_posneg_gsea)[1]<-"ensembl_gene_id"
res_posneg_gsea <- inner_join(res_posneg_gsea,ens2gene,by="ensembl_gene_id")
res_posneg_gsea2<-res_posneg_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_posneg<-deframe(res_posneg_gsea2)
fgseaRes_posneg <- fgsea(pathways=pathways.hallmark, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_tidy <-fgseaRes_posneg%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posneg_kegg_tidy <-fgseaRes_posneg_kegg%>%
as_tibble() %>%
arrange(desc(NES))
##123 pos vs bulk analysis
res_posbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"),tidy=TRUE)
colnames(res_posbulk_gsea)[1]<-"ensembl_gene_id"
res_posbulk_gsea <- inner_join(res_posbulk_gsea,ens2gene,by="ensembl_gene_id")
res_posbulk_gsea2<-res_posbulk_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_posbulk<-deframe(res_posbulk_gsea2)
fgseaRes_posbulk<- fgsea(pathways=pathways.hallmark, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.26% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.26% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_tidy <-fgseaRes_posbulk%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posbulk_kegg_tidy <-fgseaRes_posbulk_kegg%>%
as_tibble() %>%
arrange(desc(NES))
##123 neg vs bulk analysis
res_negbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"),tidy=TRUE)
colnames(res_negbulk_gsea)[1]<-"ensembl_gene_id"
res_negbulk_gsea <- inner_join(res_negbulk_gsea,ens2gene,by="ensembl_gene_id")
res_negbulk_gsea2<-res_negbulk_gsea%>%
dplyr::select(Gene, stat) %>%
na.omit() %>%
distinct() %>%
group_by(Gene) %>%
summarise(stat=mean(stat))
rank_negbulk<-deframe(res_negbulk_gsea2)
fgseaRes_negbulk<- fgsea(pathways=pathways.hallmark, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.4% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.4% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_tidy <-fgseaRes_negbulk%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_negbulk_kegg_tidy <-fgseaRes_negbulk_kegg%>%
as_tibble() %>%
arrange(desc(NES))
fgseaRes_posneg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs 123neg.')
fgseaRes_posneg_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs 123neg.')
fgseaRes_posbulk_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs bulk.')
fgseaRes_posbulk_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs bulk.')
fgseaRes_negbulk_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: Hallmark genes with 123neg vs bulk.')
fgseaRes_posneg_kegg_tidy%>%
dplyr::select(-leadingEdge, -ES) %>%
arrange(padj) %>%
DT::datatable(caption = 'Table 1: KEGG pathways with 123neg vs bulk.')
ggplot(fgseaRes_posneg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123pos vs 123neg") +
theme_minimal()
ggplot(fgseaRes_posneg_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123pos vs 123neg") +
theme_minimal()
ggplot(fgseaRes_posbulk_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123pos vs bulk") +
theme_minimal()
ggplot(fgseaRes_posbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123pos vs bulk") +
theme_minimal()
ggplot(fgseaRes_negbulk_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA 123neg vs bulk") +
theme_minimal()
ggplot(fgseaRes_negbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.1)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Kegg pathways NES from GSEA 123neg vs bulk ") +
theme_minimal()